Load dataset

Filter genes

There seem to be two distributions within the data, one with lower gene expression and the other with higher, we want to keep the genes that belong to the higher distribution

plot_data = data.frame('meanExpr'=rowMeans(datExpr))

ggplotly(plot_data %>% ggplot(aes(meanExpr)) + geom_density(alpha=0.5, color='#0099cc', fill='#0099cc') + 
         ggtitle('Mean Expression distribution') + theme_minimal())

Fitting a GMM with gaussian_comps = 2 and keeping all genes with a bigger likelihood of belonging to the higher Gaussian (mean expression > 5.259805)

GMM_mix = plot_data %>% GMM(2)

GMM_assoc = GMM_mix$Log_likelihood %>% apply(1, function(x) which.max(x))

pca_datExpr = prcomp(datExpr)$x %>% data.frame %>% dplyr::select(PC1:PC2) %>% mutate('GMM_assoc' = as.factor(GMM_assoc))

plot_gaussians = plot_data %>% ggplot(aes(x=meanExpr)) +
  stat_function(fun=dnorm, n=100, colour='#F8766D', args=list(mean=GMM_mix$centroids[1], sd=GMM_mix$covariance_matrices[1])) +
  stat_function(fun=dnorm, n=100, colour='#00BFC4', args=list(mean=GMM_mix$centroids[2], sd=GMM_mix$covariance_matrices[2])) +
  theme_minimal()

plot_pca = pca_datExpr %>% ggplot(aes(PC1, PC2, color=GMM_assoc)) + geom_point(alpha=0.3) + theme_minimal()

grid.arrange(plot_gaussians, plot_pca, ncol=2, widths=c(0.4, 0.6))

table(GMM_assoc)
## GMM_assoc
##     1     2 
## 15042 15065
rownames_datExpr = rownames(datExpr)
datExpr = datExpr %>% filter(GMM_assoc==2)
rownames(datExpr) = rownames_datExpr[GMM_assoc==2]

rm(GMM_mix, GMM_assoc, pca_datExpr, plot_gaussians, plot_pca, plot_data, rownames_datExpr)
plot_data = data.frame('meanExpr'=rowMeans(datExpr))

ggplotly(plot_data %>% ggplot(aes(meanExpr)) + geom_density(alpha=0.5, color='#0099cc', fill='#0099cc') + 
         ggtitle('Mean Expression distribution of remaining genes') + theme_minimal())

Perform Clustering

Pipeline:



Define a family of adjacency functions

  • Sigmoid function: \(a(i,j)=sigmoid(s_{ij}, \alpha, \tau_0) \equiv \frac{1}{1+e^{-\alpha(s_{ij}-\tau_0)}}\)

  • Power adjacency function: \(a(i,j)=power(s_{ij}, \beta) \equiv |S_{ij}|^\beta\)

Chose power adjacency function over the sigmoid function because it has only one parameter to adjust and both methods are supposed to lead to very similar results if the parameters are chosen with the scale-free topology criterion.

Choosing a parameter value

Following the scale-free topology criterion because metabolic networks have been found to display approximate scale free topology

  1. Only consider those parameter values that lead to a network satisfying scale-free topology at least approximately, e.g. signed \(R^2 > 0.80\)
best_power = datExpr %>% t %>% pickSoftThreshold(powerVector = 1:15, RsquaredCut=0.8)
## Warning: executing %dopar% sequentially: no parallel backend registered
##    Power SFT.R.sq  slope truncated.R.sq  mean.k. median.k.  max.k.
## 1      1   0.0217 -0.995          0.966 2.15e+03  2.11e+03 3400.00
## 2      2   0.3730 -2.500          0.978 4.81e+02  4.54e+02 1180.00
## 3      3   0.6840 -3.000          0.992 1.37e+02  1.22e+02  510.00
## 4      4   0.8060 -3.140          0.998 4.62e+01  3.80e+01  252.00
## 5      5   0.8550 -3.090          0.998 1.76e+01  1.32e+01  137.00
## 6      6   0.8870 -2.920          0.998 7.47e+00  4.98e+00   80.20
## 7      7   0.9030 -2.730          0.998 3.45e+00  2.01e+00   49.50
## 8      8   0.9140 -2.550          0.997 1.71e+00  8.59e-01   31.90
## 9      9   0.9200 -2.390          0.993 9.10e-01  3.83e-01   21.30
## 10    10   0.9380 -2.230          0.998 5.12e-01  1.79e-01   14.60
## 11    11   0.9390 -2.100          0.997 3.03e-01  8.60e-02   10.30
## 12    12   0.9450 -1.960          0.993 1.88e-01  4.24e-02    7.41
## 13    13   0.9530 -1.840          0.991 1.21e-01  2.15e-02    5.43
## 14    14   0.9650 -1.740          0.990 8.13e-02  1.11e-02    4.14
## 15    15   0.9260 -1.800          0.975 5.62e-02  5.87e-03    3.75
print(paste0('Best power for scale free topology: ', best_power$powerEstimate))
## [1] "Best power for scale free topology: 4"
S_sft = datExpr %>% t %>% adjacency(type='unsigned', power=best_power$powerEstimate, corFnc='bicor')




Defining a measure of node dissimilarity

Using topological overlap dissimilarity measure because it has been found to result in biologically meaningful modules

1st quartile is already 0.9852, most of the genes are very dissimilar

TOM = S_sft %>% TOMsimilarity
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.
dissTOM = 1-TOM

rownames(dissTOM) = rownames(S_sft)
colnames(dissTOM) = colnames(S_sft)

rm(S_sft, TOM)




Identifying gene modules

Using hierarchical clustering using average linkage on the TOM-based dissimilarity matrix

dend = dissTOM %>% as.dist %>% hclust(method='average')
plot(dend, hang=0, labels=FALSE)

Instead of using a fixed height to cut the dendrogram into clusters, using a dynamic branch cutting approach taken from Dynamic Tree Cut: in-depth description, tests and applications

Dynamic Tree Cut: top-down algorithm relying only on the dendrogram and respecting the order of the clustered objects on it. This method is less sensitive to parameter choice but also less flexible and it performed better in our previous experiments but when using it on this dataset it left most genes (8558) without a cluster, so tried doing it also with the Dynamic Hybrid algorithm

Dynamic Hybrid Cut: builds the clusters from bottom up. In addition to information from the dendrogram, it utilizes dissimilarity information among the objects. Seems to me that relies on too many heuristics and has too many parameters to tune. Ran it with the default settings

Dynamic Tree Cut Algorithm

A lot of genes (56%) are left without a cluster. On previous experiments this method left genes too close to the root unclassified, so I’ll see if it’s that’s what’s happening and if the other modules make sense

modules = cutreeDynamic(dend, method = 'tree', minClusterSize = 10)

table(modules)
## modules
##    0    1    2    3    4    5    6    7    8    9   10   11   12   13   14 
## 8558  301  265  225  207  169  142  135  134  133  128  126  119  115  107 
##   15   16   17   18   19   20   21   22   23   24   25   26   27   28   29 
##  104  103   94   89   89   87   79   76   70   68   67   66   66   66   64 
##   30   31   32   33   34   35   36   37   38   39   40   41   42   43   44 
##   63   60   59   58   56   55   54   53   52   51   51   49   48   48   42 
##   45   46   47   48   49   50   51   52   53   54   55   56   57   58   59 
##   41   40   40   39   38   38   38   38   37   36   36   36   36   36   35 
##   60   61   62   63   64   65   66   67   68   69   70   71   72   73   74 
##   34   34   33   32   30   30   29   28   28   28   28   27   26   26   26 
##   75   76   77   78   79   80   81   82   83   84   85   86   87   88   89 
##   25   25   24   23   22   22   22   22   21   21   21   21   21   20   20 
##   90   91   92   93   94   95   96   97   98   99  100  101  102  103  104 
##   20   19   19   19   19   18   18   17   17   16   16   16   15   15   15 
##  105  106  107  108  109  110  111  112  113  114  115  116  117  118  119 
##   15   15   15   15   14   14   14   14   14   14   14   13   13   13   12 
##  120  121  122  123  124  125  126  127  128  129  130  131  132  133  134 
##   12   12   12   12   12   12   12   12   11   11   11   11   11   11   11 
##  135  136  137  138  139  140  141  142  143  144  145  146  147  148  149 
##   11   11   11   11   11   11   11   11   11   11   10   10   10   10   10 
##  150  151  152  153  154  155  156  157 
##   10   10   10   10   10   10   10   10
# Note: The modules are ordered as in the rows in datExpr

Merging similar modules

# Calculate eigengenes
MEList = datExpr %>% t %>% moduleEigengenes(colors = modules)
MEs = MEList$eigengenes

# Calculate dissimilarity of module eigengenes
MEDiss = 1-cor(MEs)

# Cluster module eigengenes
METree = hclust(as.dist(MEDiss), method = 'average')

METree %>% as.dendrogram %>% plot(main = 'Clustering of module eigengenes', leaflab = 'none')
abline(h=1, col='#0099cc')
abline(h=0.5, col='#009999')

merge_top = datExpr %>% t %>% mergeCloseModules(modules, cutHeight = 1)
##  mergeCloseModules: Merging modules whose distance is less than 1
##    Calculating new MEs...
merge_similar = datExpr %>% t %>% mergeCloseModules(modules, cutHeight = 0.5)
##  mergeCloseModules: Merging modules whose distance is less than 0.5
##    Calculating new MEs...
rm(MEList, MEs, MEDiss, METree)
module_colors = c('gray',viridis(length(unique(modules))-1))
names(module_colors) = modules %>% table %>% names

merged_module_colors = c('gray',viridis(length(unique(merge_similar$colors))-1))
names(merged_module_colors) = merge_similar$colors %>% table %>% names

top_module_colors = c('gray',viridis(length(unique(merge_top$colors))-1))
names(top_module_colors) = merge_top$colors %>% table %>% names

dend_colors = data.frame('ID' = rownames(datExpr),
                         'OriginalModules' = module_colors[as.character(modules)],
                         'MergedModules' = merged_module_colors[as.character(merge_similar$colors)],
                         'TopModules' = top_module_colors[as.character(merge_top$colors)])

dend %>% as.dendrogram(hang=0) %>% plot(ylim=c(min(dend$height),1), leaflab='none')
colored_bars(colors=dend_colors[dend$order,-1])

rm(module_colors, merged_module_colors, top_module_colors)
modules_dynamic_tree = dend_colors

rm(dend_colors)

Dynamic Hybrid Cut Algorithm

modules = cutreeDynamic(dend, minClusterSize = 10, distM = dissTOM)
##  ..cutHeight not given, setting it to 0.997  ===>  99% of the (truncated) height range in dendro.
##  ..done.
table(modules)
## modules
##    0    1    2    3    4    5    6    7    8    9   10   11   12   13   14 
##  167 1447 1384  669  612  446  432  427  383  335  324  315  263  253  252 
##   15   16   17   18   19   20   21   22   23   24   25   26   27   28   29 
##  248  226  224  214  213  206  206  196  191  191  189  186  171  168  165 
##   30   31   32   33   34   35   36   37   38   39   40   41   42   43   44 
##  159  159  158  155  152  148  146  137  133  124  124  121  119  116  113 
##   45   46   47   48   49   50   51   52   53   54   55   56   57   58   59 
##  108  106   85   82   79   77   71   69   68   68   66   66   64   62   61 
##   60   61   62   63   64   65   66   67   68   69   70   71   72   73   74 
##   60   59   56   55   55   51   47   46   43   43   42   42   41   40   40 
##   75   76   77   78   79   80   81   82   83   84   85   86   87   88   89 
##   40   39   37   34   30   29   29   28   27   26   24   24   23   21   20 
##   90 
##   15

Merging similar modules

# Calculate eigengenes
MEList = datExpr %>% t %>% moduleEigengenes(colors = modules)
MEs = MEList$eigengenes

# Calculate dissimilarity of module eigengenes
MEDiss = 1-cor(MEs)

# Cluster module eigengenes
METree = hclust(as.dist(MEDiss), method = 'average')

METree %>% as.dendrogram %>% plot(main = 'Clustering of module eigengenes', leaflab = 'none')
abline(h=1, col='#0099cc')
abline(h=0.35, col='#009999')

merge_top = datExpr %>% t %>% mergeCloseModules(modules, cutHeight = 1)
##  mergeCloseModules: Merging modules whose distance is less than 1
##    Calculating new MEs...
merge_similar = datExpr %>% t %>% mergeCloseModules(modules, cutHeight = 0.38)
##  mergeCloseModules: Merging modules whose distance is less than 0.38
##    Calculating new MEs...
rm(MEList, MEs, MEDiss, METree)

Classification is quite noisy

module_colors = c('gray',viridis(length(unique(modules))-1))
names(module_colors) = modules %>% table %>% names

merged_module_colors = c('gray',viridis(length(unique(merge_similar$colors))-1))
names(merged_module_colors) = merge_similar$colors %>% table %>% names

top_module_colors = c('gray',viridis(length(unique(merge_top$colors))-1))
names(top_module_colors) = merge_top$colors %>% table %>% names

dend_colors = data.frame('ID' = rownames(datExpr),
                         'OriginalModules' = module_colors[as.character(modules)],
                         'MergedModules' = merged_module_colors[as.character(merge_similar$colors)],
                         'TopModules' = top_module_colors[as.character(merge_top$colors)])

dend %>% as.dendrogram(hang=0) %>% plot(ylim=c(min(dend$height),1), leaflab='none')
colored_bars(colors=dend_colors[dend$order,-1])

rm(module_colors, merged_module_colors, top_module_colors)
modules_dynamic_hybrid = dend_colors

rm(dend_colors)